www.gusucode.com > wxApp PHP版微信小程序CMS系统 v1.0PHP源码程序 > wxApp PHP版微信小程序CMS系统 v1.0/wxAppCMS_v1.0.0/wxAppCMS_v1.0.0/app/spider/spider_content.class.php
<?php
/**
 * iCMS - i Content Management System
 * Copyright (c) 2007-2017 iCMSdev.com. All rights reserved.
 *
 * @author icmsdev <master@icmsdev.com>
 * @site https://www.icmsdev.com
 * @licence https://www.icmsdev.com/LICENSE.html
 */
defined('iPHP') OR exit('What are you doing?');

/**
 * Extracts one data item from a crawled page and runs it through the
 * post-processing options configured for that item.
 *
 * Multi-page / multi-match results are carried around as a single string
 * joined with the marker "#--iCMS.PageBreak--#".
 */
class spider_content {
    /**
     * Map of md5(extracted content) => URL it came from. Reset by crawl()
     * and used by page_data() to skip pages whose content was already seen.
     */
    public static $hash = null;

    /**
     * Extract and post-process one data item.
     *
     * @param string|array $html      Fetched page (an array is used as the content as-is)
     * @param array        $data      Data item definition (rule, name and option flags)
     * @param array        $rule      Spider rule; `__url__` is read as the current page URL
     * @param array        $responses Data items collected so far (for [DATA@...] lookups)
     * @return mixed Processed content; '' when the rule is blank; whatever msg()
     *               returns when a check (json/filter/empty) fails
     */
    public static function crawl($html,$data,$rule,$responses) {
        @set_time_limit(0);

        if (trim($data['rule']) === '') {
            return '';
        }
        $name = $data['name'];
        if (spider::$dataTest) {
            echo'<b>['.$name.']规则:</b>'.iSecurity::escapeStr($data['rule'])."<br />";
        }

        // Reference previously collected data: [DATA@name] / [DATA@name.key]
        if (strpos($data['rule'], '[DATA@') !== false) {
            $content = spider_tools::getDATA($responses,$data['rule']);
            if (is_array($content)) {
                return $content;
            }
            $data['rule'] = $content;
        }

        // Delegate to another rule: RULE@<rule id>@<url>
        if (strpos($data['rule'], 'RULE@') !== false) {
            // array_pad: the URL part is optional, so explode() may yield one element only
            list($_rid,$_urls) = array_pad(explode('@', str_replace('RULE@', '', $data['rule'])), 2, '');
            empty($_urls) && $_urls = trim($html);
            if (spider::$dataTest) {
                print_r('<b>使用[rid:'.$_rid.']规则抓取</b>:'.$_urls);
                echo "<hr />";
            }
            return spider_urls::crawl('DATA@RULE',false,$_rid,$_urls);
        }

        // Random value: RAND@<length>,<numeric flag>  e.g. RAND@10,0
        if (strpos($data['rule'], 'RAND@') !== false) {
            $random = str_replace('RAND@', '', $data['rule']);
            list($length,$numeric) = array_pad(explode(',', $random), 2, '');
            return random($length, empty($numeric) ? 0 : 1);
        }

        if (is_array($html)) {
            $content = $html;
        } else {
            $contentArray = array();
            self::$hash   = array();
            $_content     = spider_content::match($html,$data,$rule);
            $cmd5         = md5((string)$_content);
            $contentArray[]    = $_content;
            self::$hash[$cmd5] = spider::$url;
            if (!empty($data['page'])) {
                self::page_data($html,$data,$rule,$contentArray);
            }
            $content = implode('#--iCMS.PageBreak--#', $contentArray);
            unset($contentArray,$_content);
        }
        unset($html);

        // Iterate over rows, e.g. FOREACH@<p><img src="[KEY@source]" />[KEY@add_intro]</p>
        if (strpos($data['rule'], 'FOREACH@') !== false) {
            $data_rule = str_replace('FOREACH@', '', $data['rule']);
            // "[\w-]" rather than "[\w-_]": PCRE2 (PHP >= 7.3) rejects the latter
            // as an invalid range, and "_" is already part of \w.
            preg_match_all('!.*?\[KEY@([\w-]+)\].*?!ism', $data_rule, $matchs);
            $keys   = array_values(array_unique((array)$matchs[1]));
            $search = array();
            foreach ($keys as $k) {
                $search[] = '[KEY@'.$k.']';
            }
            $contentArray = array();
            foreach ((array)$content as $value) {
                // Build the replacements in the same order as $search so that a
                // missing key cannot shift later values into the wrong placeholder.
                $replace = array();
                $found   = false;
                foreach ($keys as $k) {
                    if (isset($value[$k])) {
                        $replace[] = $value[$k];
                        $found     = true;
                    } else {
                        $replace[] = '';
                    }
                }
                // Rows that provide none of the keys are skipped.
                if ($found) {
                    $contentArray[] = str_replace($search, $replace, $data_rule);
                }
            }
            $content = implode('#--iCMS.PageBreak--#', $contentArray);
            unset($contentArray,$keys,$search);
        }

        if (spider::$dataTest) {
            // Debug output only: dump arrays with print_r so htmlspecialchars() always gets a string
            $preview = is_array($content) ? print_r($content, true) : (string)$content;
            print_r('<b>['.$name.']匹配结果:</b><div style="max-height:300px;overflow-y: scroll;">'.htmlspecialchars($preview).'</div>');
            echo "<hr />";
            unset($preview);
        }

        if (!empty($data['cleanbefor'])) {
            $content = spider_tools::dataClean($data['cleanbefor'], $content);
        }
        if (!empty($data['trim'])) {
            if (is_array($content)) {
                $content = array_map('trim', $content);
            } else {
                // Remove non-breaking spaces (entity and UTF-8 byte form).
                // NOTE(review): the previous literal was a bare space, which deleted
                // every ASCII space and fused markup such as "<img src"; confirm that
                // non-breaking spaces were the intended target.
                $content = str_replace(array('&nbsp;', "\xC2\xA0"), '', trim($content));
            }
        }
        if (!empty($data['json_decode'])) {
            $content = json_decode($content,true);
            if (is_null($content)) {
                return self::msg(
                    'JSON ERROR:'.json_last_error_msg(),
                    'content.json_decode.error',
                    $name,$rule
                );
            }
        }
        if (!empty($data['htmlspecialchars_decode'])) {
            $content = htmlspecialchars_decode($content);
        }
        if (!is_array($content)) {
            $content = stripslashes($content);
        }
        if (!empty($data['cleanhtml'])) {
            $content = preg_replace('/<[\/\!]*?[^<>]*?>/is', '', $content);
        }
        if (!empty($data['format']) && $content) {
            $content = autoformat($content);
        }
        if (!empty($data['img_absolute']) && $content) {
            // Rewrite every <img src> to an absolute URL based on the page URL
            preg_match_all("/<img.*?src\s*=[\"|'](.*?)[\"|']/is", $content, $img_match);
            if ($img_match[1]) {
                $_img_array = array_unique($img_match[1]);
                $_img_urls  = array();
                foreach ((array)$_img_array as $_img_key => $_img_src) {
                    $_img_urls[$_img_key] = spider_tools::url_complement($rule['__url__'],$_img_src);
                }
                $content = str_replace($_img_array, $_img_urls, $content);
            }
            unset($img_match,$_img_array,$_img_urls,$_img_src);
        }
        if (!empty($data['capture'])) {
            $content && $content = spider_tools::remote($content);
        }
        if (!empty($data['download'])) {
            $content && $content = iFS::http($content);
        }
        if (!empty($data['autobreakpage'])) {
            $content = spider_tools::autoBreakPage($content);
        }
        if (!empty($data['mergepage'])) {
            $content = spider_tools::mergePage($content);
        }
        if (!empty($data['cleanafter'])) {
            $content = spider_tools::dataClean($data['cleanafter'], $content);
        }
        if (!empty($data['filter'])) {
            $fwd = iPHP::callback(array("filterApp","run"),array(&$content),false);
            if ($fwd) {
                return self::msg(
                    '中包含【'.$fwd.'】被系统屏蔽的字符!',
                    'content.filter',
                    $name,$rule
                );
            }
        }
        if (!empty($data['empty'])) {
            $empty = is_array($content) ? implode('', $content) : $content;
            $empty = self::real_empty($empty);
            if (empty($empty)) {
                return self::msg(
                    '规则设置了不允许为空.当前抓取结果为空!请检查,规则是否正确!',
                    'content.empty',
                    $name,$rule
                );
            }
            unset($empty);
        }
        if (!empty(spider::$callback['content']) && is_callable(spider::$callback['content'])) {
            $content = call_user_func_array(spider::$callback['content'],array($content,$data));
        }
        if (!empty($data['array'])) {
            // Only split strings: calling strpos()/explode() on content that is already
            // an array used to discard it (and is a TypeError on PHP 8).
            if (!is_array($content) && strpos((string)$content, '#--iCMS.PageBreak--#') !== false) {
                $content = explode('#--iCMS.PageBreak--#', $content);
            }
            return (array)$content;
        }
        if (!empty($data['implode']) && is_array($content)) {
            $content = implode('', $content);
        }
        return $content;
    }

    /**
     * Collect the content of the follow-up pages of a paginated article and
     * append it to $contentArray.
     *
     * When spider::$allHtml is already filled, those cached pages are matched
     * and nothing is fetched. Otherwise the page URLs are determined, each page
     * is fetched and checked, and its HTML is stored in spider::$allHtml.
     *
     * @param string $html          HTML of the first page
     * @param array  $data          Data item definition
     * @param array  $rule          Spider rule (page_* settings, `__url__`, proxy, data_charset)
     * @param array  &$contentArray Receives one entry per accepted page
     * @return void
     */
    public static function page_data($html,$data,$rule,&$contentArray){
        if (empty($rule['page_url'])) {
            $rule['page_url'] = $rule['list_url'];
        }

        // Pages were already fetched for an earlier data item: reuse them.
        if (!empty(spider::$allHtml)) {
            foreach ((array)spider::$allHtml as $phtml) {
                $contentArray[] = spider_content::match($phtml,$data,$rule);
            }
            return;
        }

        $page_url_rule  = '';
        $page_url_array = self::page_url_list($html,$rule,$page_url_rule);

        // De-duplicate the URL list and drop the current page itself
        if ($page_url_array) {
            $page_url_array = array_filter($page_url_array);
            $page_url_array = array_unique($page_url_array);
            $puk = array_search($rule['__url__'],$page_url_array);
            if ($puk !== false) {
                unset($page_url_array[$puk]);
            }
        }

        if (spider::$dataTest) {
            echo "<b>内容页网址:</b>".$rule['__url__'] . "<br />";
            echo "<b>分页网址提取规则:</b>".iSecurity::escapeStr($page_url_rule). "<br />";
            echo "<b>分页合成:</b>".$rule['page_url'] . "<br />";
            echo "<hr />";
            echo "<b>分页列表:</b><pre>";
            print_r($page_url_array);
            echo "</pre><hr />";
        }

        if (!empty($data['page'])) {
            spider::$content_right_code = (!empty($data['dom']) ? 'DOM::' : '').$data['rule'];
        }
        if (!empty($rule['page_url_right'])) {
            spider::$content_right_code = trim($rule['page_url_right']);
        }
        spider::$content_error_code = trim(isset($rule['page_url_error']) ? $rule['page_url_error'] : '');

        if (spider::$dataTest) {
            echo "<b>有效分页特征码:</b>";
            echo iSecurity::escapeStr(spider::$content_right_code);
            echo "<br />";
            echo "<b>无效分页特征码:</b>";
            echo iSecurity::escapeStr(spider::$content_error_code);
            echo "<hr />";
        }

        if (!empty($rule['proxy'])) {
            spider::$curl_proxy = $rule['proxy'];
        }
        if (!empty($rule['data_charset'])) {
            spider::$charset = $rule['data_charset'];
        }

        $pageurl = array();
        foreach ($page_url_array AS $purl) {
            //usleep(100);
            $phtml = spider_tools::remote($purl);
            if (empty($phtml)) {
                break;
            }
            $md5 = md5($phtml);
            // Identical raw HTML was already accepted under another URL
            if (isset($pageurl[$md5])) {
                if (spider::$dataTest) {
                    echo "<b>{$purl}此分页已采过</b><hr />";
                }
                continue;
            }
            if (spider_tools::check_content_code($phtml,'error') === false) {
                if (spider::$dataTest) {
                    echo "<b>找到无效分页特征码,中止其它分页采集</b><hr />";
                }
                break;
            }
            if (spider_tools::check_content_code($phtml,'right') === false) {
                if (spider::$dataTest) {
                    echo "<b>未找到有效分页特征码,中止其它分页采集</b><hr />";
                }
                break;
            }
            $_content = spider_content::match($phtml,$data,$rule);
            $cmd5     = md5((string)$_content);
            $_purl    = isset(self::$hash[$cmd5]) ? self::$hash[$cmd5] : null;
            // Same extracted content as an earlier page: skip this one
            if ($_purl) {
                if (spider::$dataTest) {
                    echo "<b>发现[{$purl}]正文与[{$_purl}]相同,跳过本页采集</b><hr />";
                }
                continue;
            }
            $contentArray[]        = $_content;
            $pageurl[$md5]         = $purl;
            self::$hash[$cmd5]     = $purl;
            spider::$allHtml[$md5] = $phtml;
        }
        gc_collect_cycles();
        unset($phtml);

        if (spider::$dataTest) {
            echo "<b>最终分页列表:</b><pre>";
            print_r($pageurl);
            echo "</pre><hr />";
        }
    }

    /**
     * Build the list of follow-up page URLs, either by extracting links from
     * the page (page_area_rule set) or by generating them from a step range.
     *
     * @param string $html           HTML of the first page
     * @param array  $rule           Spider rule
     * @param string &$page_url_rule Receives the last URL pattern used (shown in test output)
     * @return array Page URLs; keys are match indexes or page numbers
     */
    private static function page_url_list($html,$rule,&$page_url_rule){
        $page_url_array = array();
        $page_area_rule = trim(isset($rule['page_area_rule']) ? $rule['page_area_rule'] : '');

        if ($page_area_rule) {
            if (strpos($page_area_rule, 'DOM::') !== false) {
                // Selector mode: take the href of every node the selector matches
                iPHP::vendor('phpQuery');
                $doc      = phpQuery::newDocumentHTML($html,'UTF-8');
                $pq_dom   = str_replace('DOM::','', $page_area_rule);
                $pq_array = phpQuery::pq($pq_dom);
                foreach ($pq_array as $pn => $pq_val) {
                    $href = phpQuery::pq($pq_val)->attr('href');
                    if (!$href) {
                        continue;
                    }
                    if (!empty($rule['page_url_rule'])) {
                        if (strpos($rule['page_url_rule'], '<%') !== false) {
                            // Tag-style rule: the href must match the pattern
                            $page_url_rule = spider_tools::pregTag($rule['page_url_rule']);
                            if (!preg_match('|' . $page_url_rule . '|is', $href)) {
                                continue;
                            }
                        } else {
                            // Otherwise the rule is applied as a clean-up; an empty result drops the link
                            $cleanhref = spider_tools::dataClean($rule['page_url_rule'],$href);
                            if (!$cleanhref) {
                                continue;
                            }
                            $href = $cleanhref;
                        }
                    }
                    $href = str_replace('<%url%>',$href, $rule['page_url']);
                    $page_url_array[$pn] = spider_tools::url_complement($rule['__url__'],$href);
                }
                phpQuery::unloadDocuments($doc->getDocumentID());
            } else {
                // Regex mode: narrow down to the pagination area, then match the URLs in it
                $page_area_rule = spider_tools::pregTag($page_area_rule);
                if ($page_area_rule) {
                    // preg_match() takes no PREG_SET_ORDER flag; the former 4th
                    // argument was an undefined variable.
                    preg_match('|' . $page_area_rule . '|is', $html, $matches);
                    $page_area = isset($matches['content']) ? $matches['content'] : '';
                } else {
                    $page_area = $html;
                }
                if (!empty($rule['page_url_rule'])) {
                    $page_url_rule = spider_tools::pregTag($rule['page_url_rule']);
                    preg_match_all('|' .$page_url_rule. '|is', $page_area, $page_url_matches, PREG_SET_ORDER);
                    foreach ((array)$page_url_matches AS $pn => $row) {
                        $href = str_replace('<%url%>', $row['url'], $rule['page_url']);
                        $page_url_array[$pn] = spider_tools::url_complement($rule['__url__'],$href);
                    }
                }
                unset($page_area);
            }
            return $page_url_array;
        }

        // Logic mode: generate the URLs from page_no_start..page_no_end
        if ($rule['page_url_parse'] == '<%url%>') {
            $page_url = str_replace('<%url%>',$rule['__url__'],$rule['page_url']);
        } else {
            $page_url_rule = spider_tools::pregTag($rule['page_url_parse']);
            preg_match('|' . $page_url_rule . '|is', $rule['__url__'], $matches);
            $page_url = str_replace('<%url%>', isset($matches['url']) ? $matches['url'] : '', $rule['page_url']);
        }
        if (stripos($page_url,'<%step%>') !== false) {
            // A zero, empty or negative step would never reach page_no_end
            $step = (isset($rule['page_no_step']) && $rule['page_no_step'] > 0) ? $rule['page_no_step'] : 1;
            for ($pn = $rule['page_no_start']; $pn <= $rule['page_no_end']; $pn = $pn + $step) {
                $pno = $pn;
                if (!empty($rule['page_no_fill'])) {
                    $pno = sprintf("%0".$rule['page_no_fill']."s",$pn);
                }
                // Case-insensitive, to agree with the stripos() check above
                $page_url_array[$pn] = str_ireplace('<%step%>', $pno, $page_url);
            }
        }
        return $page_url_array;
    }

    /**
     * Reduce a piece of HTML to what is left once whitespace and "empty"
     * markup are removed, so callers can test the result with empty().
     *
     * @param string $text HTML fragment
     * @return string Remaining text; '' when nothing meaningful is left
     */
    public static function real_empty($text){
        // Plain, non-breaking (entity + U+00A0) and ideographic (U+3000) spaces
        $text = str_replace(array(' ', '&nbsp;', "\xC2\xA0", "\xE3\x80\x80"), '', $text);
        $text = preg_replace(array(
            '/\s+/',
            '@<p[^>]*>\s*<br[^>]*>\s*</p>@',
            // Empty element such as <div></div>; \1 refers back to the tag name
            // (the former "<\$1>" matched the literal text "<$1>" only).
            '@<(\w+)>\s*</\1>@',
            '@</*(p|strong|b|span)>@'
        ), '', $text);
        return trim($text);
    }

    /**
     * Apply a data item's rule to a page and return the raw match.
     *
     * With $data['dom'] set the rule is a phpQuery selector, otherwise it is a
     * tag-style pattern converted by spider_tools::pregTag(). With
     * $data['multi'] set, all matches are joined with "#--iCMS.PageBreak--#".
     *
     * @param string $html Page HTML
     * @param array  $data Data item definition (rule, dom, multi, trim)
     * @param array  $rule Spider rule (not used here)
     * @return mixed Matched content; null when a single regex match finds nothing
     */
    public static function match($html,$data,$rule){
        if (!empty($data['dom'])) {
            return self::match_dom($html,$data);
        }
        return self::match_regex($html,$data);
    }

    /**
     * Selector based extraction. The rule is either "selector@attribute" or
     * up to three lines: selector, phpQuery method (default "html"), argument.
     */
    private static function match_dom($html,$data){
        iPHP::vendor('phpQuery');
        if (spider::$dataTest && !empty($_GET['pq_debug'])) {
            phpQuery::$debug = 1;
        }
        // Scripts and styles are removed before the document is parsed
        $html = preg_replace(array('/<script.+?<\/script>/is','/<style.+?<\/style>/is'),'',$html);
        $doc  = phpQuery::newDocumentHTML($html,'UTF-8');

        if (strpos($data['rule'], '@') !== false) {
            list($content_dom,$content_attr) = explode("@", $data['rule']);
            $content_fun = 'attr';
        } else {
            list($content_dom,$content_fun,$content_attr) = array_pad(explode("\n", $data['rule']), 3, '');
        }
        $content_dom  = trim($content_dom);
        $content_fun  = trim($content_fun);
        $content_attr = trim($content_attr);
        $content_fun OR $content_fun = 'html';

        if (!empty($data['multi'])) {
            $conArray   = array();
            $match_hash = array();
            foreach ($doc[$content_dom] as $doc_key => $doc_value) {
                $node = phpQuery::pq($doc_value);
                if ($content_attr) {
                    $_content = $node->$content_fun($content_attr);
                } else {
                    $_content = $node->$content_fun();
                }
                $cmd5 = md5((string)$_content);
                // Stop at the first repeated value
                if (isset($match_hash[$cmd5])) {
                    break;
                }
                if (!empty($data['trim'])) {
                    $_content = trim($_content);
                }
                if (empty($_content)) {
                    // Empty values get a per-position key so they never trigger the stop above
                    $cmd5 = 'empty('.$doc_key.')';
                } else {
                    $conArray[$doc_key] = $_content;
                }
                $match_hash[$cmd5] = true;
            }
            if (spider::$dataTest) {
                echo "<b>多条匹配结果:</b><pre>";
                print_r($match_hash);
                echo "</pre><hr />";
            }
            $content = implode('#--iCMS.PageBreak--#', $conArray);
        } else {
            if ($content_attr) {
                $content = $doc[$content_dom]->$content_fun($content_attr);
            } else {
                $content = $doc[$content_dom]->$content_fun();
            }
        }
        phpQuery::unloadDocuments($doc->getDocumentID());
        unset($doc);
        return $content;
    }

    /**
     * Pattern based extraction. "<%content%>" returns the page unchanged; a
     * rule without any pattern syntax is returned as a literal value.
     */
    private static function match_regex($html,$data){
        if (trim($data['rule']) == '<%content%>') {
            return $html;
        }
        $data_rule = spider_tools::pregTag($data['rule']);
        // No named group / wildcard in the converted rule: treat the rule as a fixed value
        if (!preg_match('/(<\w+>|\.\*|\.\+|\\\d|\\\w)/i', $data_rule)) {
            return $data['rule'];
        }
        if (empty($data['multi'])) {
            preg_match('|' . $data_rule . '|is', $html, $matches);
            return isset($matches['content']) ? $matches['content'] : null;
        }

        preg_match_all('|' . $data_rule . '|is', $html, $matches, PREG_SET_ORDER);
        $conArray   = array();
        $match_hash = array();
        foreach ((array) $matches AS $mkey => $mat) {
            $found = isset($mat['content']) ? $mat['content'] : '';
            $cmd5  = md5($found);
            // Stop at the first repeated value
            if (isset($match_hash[$cmd5])) {
                break;
            }
            if (!empty($data['trim'])) {
                $found = trim($found);
            }
            if (empty($found)) {
                // Empty values get a per-position key so they never trigger the stop above
                $cmd5 = 'empty('.$mkey.')';
            } else {
                $conArray[$mkey] = $found;
            }
            $match_hash[$cmd5] = true;
        }
        if (spider::$dataTest) {
            echo "<b>多条匹配结果:</b><pre>";
            print_r($match_hash);
            echo "</pre><hr />";
        }
        return implode('#--iCMS.PageBreak--#', $conArray);
    }

    /**
     * Report a failed check for a data item.
     *
     * In test mode the script exits with the message; in worker mode the error
     * is logged via spider::errorlog() and echoed; otherwise iUI::alert() is called.
     *
     * @param string $msg  Message text (prefixed with "[name]")
     * @param string $type Error type passed to spider::errorlog()
     * @param string $name Data item name
     * @param array  $rule Spider rule; `__url__` is logged
     * @return null
     */
    public static function msg($msg,$type,$name,$rule){
        $msg = '['.$name.']'.$msg;
        if (spider::$dataTest) {
            exit('<h1>'.$msg.'</h1>');
        }
        if (spider::$work) {
            echo spider::errorlog($msg,$rule['__url__'],$type);
            echo "\n{$msg}\n";
            return null;
        }
        iUI::alert($msg);
        return null;
    }
}